Riemannian metric from manifold learning
library(tidyverse)
library(dimRed)
library(reticulate)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
Jmisc::sourceAll(here::here("R/sources"))
set.seed(123)

Smart meter data for one household
# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
#
# train <- spdemand %>%
# lazy_dt() %>%
# filter(tow <= ntow,
# # id <= sort(unique(spdemand[,id]))[nid],
# id == 1003) %>%
# dplyr::select(-id, -tow) %>%
# data.table::as.data.table()
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))

Metric learning
Radius searching with k-d trees
# Parameters fixed
x <- train
N <- nrow(train)       # number of observations
s <- 2                 # target embedding dimension
k <- 20                # number of nearest neighbours for KNN search
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt{\epsilon}, as in the algorithm

The metric learning algorithm

# Run metric learning with ANN Isomap; invert.h = TRUE returns the inverse
# of the Riemannian metric (matching the megaman H matrix output).
metric_isomap <- metricML(x,
  s = s, k = k, radius = radius,
  method = method, annmethod = annmethod, eps = 0,
  distance = distance, treetype = treetype, searchtype = searchtype,
  invert.h = TRUE
)
summary(metric_isomap)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
## laplacian 112896 dgCMatrix S4
The function metricML() returns a list of
- the embedding coordinates `embedding` of dimension \(N \times s\)
- the embedding metric `rmetric` for each point \(p \in x\) as an array
- the weighted neighborhood graph as an igraph object `weighted_graph`
- the sparse adjacency matrix from the graph `adj_matrix`
- the graph laplacian matrix `laplacian`
Check the output for each of the four steps
The metric learning algorithm consists of four steps:
- construct a weighted neighborhood graph `weighted_graph`
- calculate the graph laplacian `laplacian`
- map the data \(p \in x\) to the embedding coordinates `embedding`
- apply the graph laplacian to the embedding coordinates to obtain the embedding metric `rmetric`
Based on the output from the megaman package, I compared the R function metricML() output for each of the four steps.
Step1: weighted graph
We compare the graph by using the weighted adjacency matrix.
metric_isomap$adj_matrix[1:6, 1:6]## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] 1.0000000 0.7403658 0.4893711 . . .
## [2,] 0.7403658 1.0000000 0.8251851 0.6199676 0.4515039 0.3956647
## [3,] 0.4893711 0.8251851 1.0000000 0.8651579 0.6997442 0.5946050
## [4,] . 0.6199676 0.8651579 1.0000000 0.9248341 0.7988909
## [5,] . 0.4515039 0.6997442 0.9248341 1.0000000 0.8770268
## [6,] . 0.3956647 0.5946050 0.7988909 0.8770268 1.0000000
aff <-
feather::read_feather(here::here("data/affinity_matrix_1id336tow.feather"))
aff[1:6, 1:6]## # A tibble: 6 x 6
## `0` `1` `2` `3` `4` `5`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.740 0.489 0 0 0
## 2 0.740 1 0.825 0.620 0.452 0.396
## 3 0.489 0.825 1 0.865 0.700 0.595
## 4 0 0.620 0.865 1 0.925 0.799
## 5 0 0.452 0.700 0.925 1 0.877
## 6 0 0.396 0.595 0.799 0.877 1
all.equal(as.matrix(aff), as.matrix(metric_isomap$adj_matrix), tolerance = 1e-5, check.attributes = F)## [1] TRUE
Step2: Laplacian matrix
Ln <- metric_isomap$laplacian
Ln[1:6, 1:6]## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] -23.8073282 0.6096452 0.2482634 . . .
## [2,] 0.8217472 -24.2336948 0.3895801 0.2092046 0.1386494 0.1262752
## [3,] 0.4589835 0.5343428 -24.6010558 0.2466973 0.1815778 0.1603563
## [4,] . 0.3379937 0.2905886 -24.7599288 0.2020497 0.1813911
## [5,] . 0.2316924 0.2212245 0.2089848 -24.7943612 0.1874356
## [6,] . 0.2083193 0.1928744 0.1852211 0.1850419 -24.7807239
lap <-
feather::read_feather(here::here("data/laplacian_matrix_1id336tow.feather"))
lap[1:6, 1:6]## # A tibble: 6 x 6
## `0` `1` `2` `3` `4` `5`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -23.8 0.610 0.248 0 0 0
## 2 0.822 -24.2 0.390 0.209 0.139 0.126
## 3 0.459 0.534 -24.6 0.247 0.182 0.160
## 4 0 0.338 0.291 -24.8 0.202 0.181
## 5 0 0.232 0.221 0.209 -24.8 0.187
## 6 0 0.208 0.193 0.185 0.185 -24.8
all.equal(as.matrix(lap), as.matrix(Ln), tolerance = 1e-5, check.attributes = F)## [1] TRUE
Step3: Embedding coordinates
fn <- metric_isomap$embedding
fn %>% head()## E1 E2
## [1,] 0.07615319 0.4229855
## [2,] -0.09854840 0.3503061
## [3,] -0.21017105 0.2766018
## [4,] -0.29429931 0.1997070
## [5,] -0.31819209 0.1549928
## [6,] -0.34529194 0.1165973
emb_isomap <- feather::read_feather(here::here("data/embedding_isomap_1id336tow.feather"))
emb_isomap %>% head()## # A tibble: 6 x 2
## `0` `1`
## <dbl> <dbl>
## 1 0.0725 -0.465
## 2 -0.105 -0.407
## 3 -0.204 -0.290
## 4 -0.307 -0.186
## 5 -0.354 -0.122
## 6 -0.398 -0.114
all.equal(as.matrix(emb_isomap), as.matrix(fn), tolerance = 1e-5, check.attributes = F)## [1] "Mean relative difference: 0.8138705"
par(mfrow = c(1, 2))
plot(fn, col = viridis::viridis(24), asp=1)
plot(emb_isomap$`0`, emb_isomap$`1`, col = viridis::viridis(24), asp=1)

Step4: Riemannian metric
In metricML(), the invert.h argument controls whether the inverse of the Riemannian metric should be returned; it is FALSE by default.
The megaman package returns the inverse of the Riemannian metric.
# Rn <- riemann_metric(Y = fn, laplacian = Ln, d = s, invert.h = T)
Rn <- metric_isomap$rmetric
Rn[,,1:3]## , , 1
##
## [,1] [,2]
## [1,] 0.4378822 0.0594542
## [2,] 0.0594542 0.1081907
##
## , , 2
##
## [,1] [,2]
## [1,] 0.5268865 0.1531055
## [2,] 0.1531055 0.2067521
##
## , , 3
##
## [,1] [,2]
## [1,] 0.4775869 0.1744485
## [2,] 0.1744485 0.3813567
np = import("numpy")
H_isomap <- np$load(here::here("data/hmatrix_isomap_1id336tow.npy"))
H_isomap[1,,, drop=TRUE] %>%
matrixcalc::is.positive.definite()## [1] TRUE
# Rearrange the megaman output H_isomap (an N x s x s array, points in the
# first dimension) into the s x s x N layout used by metricML(), so that
# pyh_isomap[, , i] is the (inverse) Riemannian metric at point i.
# aperm() does this generalized transpose in one vectorized step,
# replacing the original element-wise for loop over 1:N.
pyh_isomap <- aperm(H_isomap, c(2, 3, 1))
pyh_isomap[,,1:3] # inverted## , , 1
##
## [,1] [,2]
## [1,] 0.37657276 -0.00438684
## [2,] -0.00438684 0.17479710
##
## , , 2
##
## [,1] [,2]
## [1,] 0.5200775 -0.1136964
## [2,] -0.1136964 0.2999303
##
## , , 3
##
## [,1] [,2]
## [1,] 0.4890490 -0.1653728
## [2,] -0.1653728 0.4545575
all.equal(pyh_isomap, Rn, tolerance = 1e-5, check.attributes = F)## [1] "Mean relative difference: 0.5528692"
# If we use the embedding from Python, the inverse of Riem metric is the same.
hn <- riemann_metric(Y = as.matrix(emb_isomap), laplacian = Ln, d = s, invert.h = T)
hn[,,1:3]## , , 1
##
## [,1] [,2]
## [1,] 0.37657276 -0.00438684
## [2,] -0.00438684 0.17479710
##
## , , 2
##
## [,1] [,2]
## [1,] 0.5200775 -0.1136964
## [2,] -0.1136964 0.2999303
##
## , , 3
##
## [,1] [,2]
## [1,] 0.4890490 -0.1653728
## [2,] -0.1653728 0.4545575
all.equal(pyh_isomap, hn, tolerance = 1e-5, check.attributes = F) # use python embedding## [1] TRUE
Finally, the R function metricML() gives the same/close outputs as the Python megaman package.
Ellipse plot
Using the Riemannian metric for each point as the 2-d covariance matrix and the 2-d embeddings as the centroid to plot an ellipse for each point.
The underlying requirement is that the Riemannian metric needs to be a square positive definite matrix, i.e. matrixcalc::is.positive.definite() returns TRUE.
# TODO: if add = FALSE, fix xlim, ylim, radius for generating ellipse
# TODO: add argument `n.plot` to specify how many ellipses to be added
x <- metric_isomap$embedding
cols <- viridis::viridis(24)
plot(metric_isomap$embedding, col = cols, asp = 1, pch = 20)
# Overlay one ellipse per point: centred at the point's embedding
# coordinates, with shape given by its 2 x 2 Riemannian metric Rn[, , i].
for (i in seq_len(N)) {
  mat <- Rn[, , i] # pyh_isomap[, , i]
  center <- c(x[i, 1], x[i, 2])
  # add <- ifelse(i == 1, FALSE, TRUE)
  add <- TRUE
  car::ellipse(center, mat,
    radius = .1, # controls the size of the ellipse, currently set manually
    # col = cols[i],
    col = blues9[5],
    asp = 1, pty = "s", lwd = 1, center.pch = 19, center.cex = 0,
    fill = TRUE, fill.alpha = 0.2, add = add, grid = TRUE,
    # xlim = c(-.2, .25), ylim = c(-.2, .2)
  )
}

Contour plots
Variable kernel density estimate
For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_H (x - X_i).\]
Inverse of Riemannian metric as bandwidth matrix
If we use the embedding x and the inverse of Riemannian metric h, we could produce contour plots based on the density estimates.
x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
f <- mkde(x = x, h = pyh_isomap)## Time difference of 9.785535 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1160 0.2240 0.3087 0.3029 0.3916 0.4828
# Top 20 index with the lowest densities
head(order(f), n=20)## [1] 39 328 327 182 183 136 329 135 40 38 134 278 282 133 326 86 279 231 137
## [20] 281
Now apply the mkde() function to the output of metricML() written in R.
x <- metric_isomap$embedding
riem_isomap <- metric_isomap$rmetric
f <- mkde(x = x, h = riem_isomap)## Time difference of 7.821089 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09451 0.20224 0.30641 0.28375 0.36464 0.47392
head(order(f), n=20)## [1] 39 328 327 183 182 135 282 278 329 40 134 136 38 326 231 86 133 281 137
## [20] 279
We can check the outliers based on density f. In R, the outliers are similar to the megaman output.
Riemannian metric itself as bandwidth matrix
R_isomap <- np$load(here::here("data/rmatrix_isomap_1id336tow.npy"))
# R_isomap[1,,, drop=TRUE] %>%
# matrixcalc::is.positive.definite() # not necessarily symmetric
# Rearrange the megaman output R_isomap (an N x s x s array, points in the
# first dimension) into the s x s x N layout used by metricML(), so that
# pyr_isomap[, , i] is the Riemannian metric at point i.
# aperm() does this generalized transpose in one vectorized step,
# replacing the original element-wise for loop over 1:N.
pyr_isomap <- aperm(R_isomap, c(2, 3, 1))
pyr_isomap[,,1:3]## , , 1
##
## [,1] [,2]
## [1,] 2.65630592 0.06666465
## [2,] 0.06666465 5.72259192
##
## , , 2
##
## [,1] [,2]
## [1,] 2.0965329 0.7947453
## [2,] 0.7947453 3.6353771
##
## , , 3
##
## [,1] [,2]
## [1,] 2.331629 0.848271
## [2,] 0.848271 2.508551
x <- cbind(emb_isomap$`0`, emb_isomap$`1`)
f <- mkde(x = x, h = pyr_isomap)## Time difference of 5.854377 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02254 0.03947 0.04608 0.04784 0.05574 0.08985
# Top 20 index with the lowest densities
head(order(f), n=20)## [1] 298 296 297 300 299 199 55 200 251 56 250 249 54 248 198 104 276 152 268
## [20] 312
p_pyr <- plot.contour(x, f, plot.hdr = F)

Now apply the mkde() function to the output of metricML() written in R.
hn <- riemann_metric(Y = as.matrix(emb_isomap), laplacian = Ln, d = s, invert.h = T)
f <- mkde(x = metric_isomap$embedding, h = hn)## Time difference of 6.570726 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1031 0.2424 0.3005 0.3044 0.3933 0.4917
head(order(f), n=20)## [1] 282 183 328 39 135 237 40 136 329 182 327 283 326 281 38 232 41 331 137
## [20] 134
p_rr <- plot.contour(x, f, plot.hdr = F)

t-SNE
metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = "tSNE", annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)## Warning in matchPars(methodObject, list(...)): Parameter matching: knn is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: annmethod is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: radius is not
## a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: eps is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: nt is not a
## standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: nlinks is not
## a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching:
## ef.construction is not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: distance is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: treetype is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: searchtype is
## not a standard parameter, ignoring.
summary(metric_tsne)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
## laplacian 112896 dgCMatrix S4
f <- mkde(x = metric_tsne$embedding, h = metric_tsne$rmetric)## Time difference of 8.543279 secs
summary(f)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0001540 0.0003175 0.0003758 0.0004688 0.0006294 0.0013705
head(order(f), n=20)## [1] 69 212 213 157 165 70 68 116 117 214 164 158 118 71 215 156 166 109 216
## [20] 61
plot.contour(x = metric_tsne$embedding, f, plot.hdr = F)## $p
## NULL